################################################################################
# Clean Maryland's 2020 audit data
#
# written by sbaltz at mit
#   may 2023
################################################################################
import pandas as pd
import numpy as np
import copy
import os


################################################################################
# Data cleaning functions
################################################################################
def SplitColByStr(df, oldColName, newColName, theString):
    """Copy a regex capture from one column into another column.

    theString is passed to ``Series.str.extract``, so it must contain a
    capture group; the first (unnamed) group is the one that is used.
    Rows where the pattern matches get the captured text written into
    newColName, every other row is left untouched. The frame is modified
    in place and is also returned.
    """
    #First capture group for every row, NA where the pattern did not match
    captured = df[oldColName].str.extract(theString, expand=True)[0]
    #Positional boolean mask of the rows that produced a match
    matched = captured.notna().to_numpy()
    #Overwrite only the matching rows of the target column
    df.loc[matched, newColName] = captured[matched]
    return df

def CleanCandidate(df):
    """General cleaning of candidate names.

    Works on the ``candidate`` column of df: upper-cases the names, removes
    periods and commas, turns parentheses into double quotes, squeezes runs
    of spaces down to a single space and strips leading/trailing
    whitespace. The frame is modified in place and is also returned.
    """
    #Raw-string regexes (no invalid escape sequences), applied in order.
    #Whitespace is collapsed last so that gaps opened up by the removed
    #punctuation are squeezed as well. Runs of spaces become ONE space
    #rather than nothing, so adjacent name parts are never glued together.
    replacements = {
        r'\.': '',
        r'\(': '"',
        r'\)': '"',
        r',': '',
        r' {2,}': ' ',
    }
    cleaned = df['candidate'].str.upper()
    for pattern, substitute in replacements.items():
        cleaned = cleaned.str.replace(pattern, substitute, regex=True)
    df['candidate'] = cleaned.str.strip()
    return df

def MergeStateCodes(df):
    """Left-join the state-code lookup table onto df by the ``state`` column.

    The lookup is read from ``../../help-files/merge_on_statecodes.csv``
    (relative to the working directory) and its ``state`` values are
    upper-cased before merging. Every row of df is kept. A new frame is
    returned.
    """
    codes = pd.read_csv('../../help-files/merge_on_statecodes.csv')
    codes['state'] = codes['state'].str.upper()
    #The join key is the state only; nothing county-specific happens here
    return pd.merge(df, codes, how='left', on='state')

def GenPartySimplified(df):
    """Derive ``party_simplified`` from ``party_detailed``.

    ``party_detailed`` is first normalised to strings, with missing values
    turned into ''. A party keeps its detailed label when it is one of
    DEMOCRAT, REPUBLICAN, LIBERTARIAN, NONPARTISAN or the empty string;
    anything else is labelled OTHER. The frame is modified in place and is
    also returned.
    """
    detailed = df['party_detailed'].fillna('').astype(str)
    df['party_detailed'] = detailed
    recognised = ['DEMOCRAT', 'REPUBLICAN', 'LIBERTARIAN', 'NONPARTISAN', '']
    #Keep recognised labels, substitute OTHER everywhere else
    df['party_simplified'] = detailed.where(detailed.isin(recognised), 'OTHER')
    return df

def PadDistrict(df):
    """Zero-pad purely numeric district labels to three characters.

    The ``district`` column is cast to str with missing values turned into
    ''. Labels made up only of numeric characters are left-padded with
    zeros to a width of three; all other labels are kept as they are. The
    frame is modified in place and is also returned.
    """
    district = df['district'].fillna('').astype(str)
    is_number = district.str.isnumeric()
    padded = district.str.zfill(3)
    #Take the padded form only where the label is numeric
    df['district'] = np.where(is_number, padded, district)
    return df


################################################################################
# Quality of life functions
################################################################################
def QuietPrint(theString):
    """Defines verbosity-aware printing.

    Prints theString only when the module-level name STAGEWISE_VERBOSE
    exists and is truthy. If STAGEWISE_VERBOSE has not been defined the
    function stays quiet instead of raising NameError.
    """
    #Only reading the flag, so no `global` statement is needed; look it up
    #with a default so an undefined flag means "not verbose"
    if globals().get('STAGEWISE_VERBOSE', False):
        print(theString)


################################################################################
# Data reading
################################################################################
#Every entry of the transcription folder is read as one audit report.
#os.listdir returns names in arbitrary order, so sort them to make the row
#order of the combined output reproducible from run to run.
md_files = sorted(os.listdir('../transcribed'))

#Skip anything with '~' in its name (presumably editor lock/backup files)
md_files = [_ for _ in md_files if '~' not in _]

#Files whose column-header row is not at the default position (row 10)
spec_starts = {'03_baltimorecity_COVC_2020pg_Phase 2.csv': 38}

#Collect the per-file tables and concatenate once after the loop, rather
#than re-copying the growing result on every iteration
county_frames = []
for fname in md_files:
    HEADER = spec_starts.get(fname, 10)
    #The county name is the part of the file name between the first
    #underscore and '_COVC'
    county = fname[fname.find('_')+1:fname.find('_COVC')]
    raw = pd.read_csv(os.path.join('../transcribed', fname),
                      header = HEADER
                      )
    #The 6th and 7th columns hold the original and the audited counts
    raw = raw.rename(columns={
                              'Choice': 'candidate',
                              'Precinct': 'precinct',
                              raw.columns[5]: 'original',
                              raw.columns[6]: 'audited'
                    })
    raw = raw[['candidate','precinct','original','audited']]
    raw = raw.dropna(how='all')

    #Rows with text in the candidate column but no audited count are used
    #as office headings: copy the heading down onto the rows beneath it.
    #Building the column from `candidate` keeps it string-typed from the
    #start, and .ffill() replaces the deprecated fillna(method='ffill').
    officeLocs = raw['candidate'].notna() & raw['audited'].isna()
    raw['office'] = raw['candidate'].where(officeLocs).ffill()

    #Keep only rows whose audited value is present and made up of digits.
    #NOTE(review): .str.isdigit() assumes the audited column was read as
    #text - confirm this holds for every transcribed file.
    raw = raw.loc[raw['audited'].notna()]
    #Explicit copy so the assignments below write to an independent frame
    raw = raw.loc[raw['audited'].str.isdigit()].copy()
    raw['audited'] = raw['audited'].astype(int)
    raw['original'] = raw['original'].astype(int)

    raw['difference'] = raw['audited'] - raw['original']
    raw['state'] = 'MARYLAND'
    raw['county'] = county
    raw['method'] = 'SOFTWARE'
    raw['mode'] = ''
    raw['party'] = ''

    county_frames.append(raw)

#pd.concat refuses an empty list, so fall back to an empty frame
if county_frames:
    all_res = pd.concat(county_frames, ignore_index=True)
else:
    all_res = pd.DataFrame()


################################################################################
# Cleaning and standardizing
################################################################################
#County names
#The names come from the file names, where multi-word counties are run
#together; map those back to their spaced spellings
replacements = {
    'ANNEARUNDEL': 'ANNE ARUNDEL',
    'QUEENANNES': 'QUEEN ANNES',
    'STMARYS': 'ST MARYS',
    'BALTIMORECITY': 'BALTIMORE CITY',
    'PRINCEGEORGES': 'PRINCE GEORGES'
}
#Upper-case, drop underscores, then swap whole values found in the mapping
all_res['county'] = (
    all_res['county']
    .str.upper()
    .str.replace('_','')
    .replace(replacements)
)

#Lots of cleaning left to do, but we will clean it in the merging file


################################################################################
# Write results to disk
################################################################################
#Save the combined table as CSV, leaving out the row index
output_path = '../ready/md_cleaned.csv'
all_res.to_csv(output_path, index=False)

